In [2]:
%matplotlib inline

import pandas as pd
from pymongo import MongoClient
import matplotlib.pyplot as plt
from matplotlib import dates
import numpy as np
from pandas.tseries.offsets import DateOffset


# Connect to the local MongoDB instance and select the database
db = MongoClient('localhost', 27017)['twitter_db']

# Select the collection holding the streamed tweets
collection = db['twitter_collection']
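
As a quick sanity check that the connection works (a minimal sketch; it assumes the collection was already populated by a streaming script), count the stored tweets:

In [ ]:
# Total number of tweets captured by the stream
print collection.count()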

Tweets per Language


In [3]:
# Group tweets by language code and sort by tweet count, descending
cursor = collection.aggregate(
    [
        {"$group": {"_id": "$lang", "count": {"$sum": 1}}},
        {"$sort": {"count": -1}}
    ]
)
langFrame = pd.DataFrame(list(cursor))
langFrame = langFrame.rename(columns={'_id': 'lang', 'count': 'tweets'})
langFrame = langFrame[langFrame['lang'] != 'und']   # drop the 'undetermined' language bucket
langFrame[:10].plot(x='lang', y='tweets', kind='bar', logy=True, color='red')
plt.xlabel('Language')
plt.ylabel('Number of Tweets')
plt.title('Tweets per Language')
plt.savefig('language_graph.png',dpi=180)


Tweets over Time


In [4]:
#Select 'created_at'
cursor = collection.find({},{"created_at":1})
dateFrame = pd.DataFrame(list(cursor))

#Convert to datetime
dateFrame['created_at'] = pd.to_datetime(dateFrame['created_at'])
dateFrame.set_index('created_at', drop=False, inplace=True)
dateFrame.index = dateFrame.index.tz_localize('GMT').tz_convert('EST')
# Shift the timestamps back a further 12 hours
dateFrame.index = dateFrame.index - DateOffset(hours=12)
dateFrame1m = dateFrame['created_at'].resample('1t', how='count')
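
A caveat for anyone re-running this cell on a newer stack: the how= keyword to resample was later deprecated and removed from pandas, so on a modern install the equivalent call would be:

In [ ]:
# Modern-pandas equivalent of the resample call above
dateFrame1m = dateFrame['created_at'].resample('1T').count()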

In [5]:
#Visualize
fig = plt.figure(figsize=(6,4))

plt.title("Tweets over Time")
plt.ylabel("Tweets per Minute")
ax = dateFrame1m.plot()
ax.text("2016-02-25 09:42:00-05:00", 2500,
        'Clearly, we max out the Twitter Streaming API rate',
        horizontalalignment='center')
plt.xlabel("Time")
plt.savefig('tweets_over_time.png', dpi=180)


Tweets Referencing Candidates


In [6]:
def get_resampled_data(candidate_name):
    """Return per-minute counts of tweets whose text mentions candidate_name."""
    candidateFrame = pd.DataFrame()
    # Case-insensitive substring match; an unanchored $regex cannot use an
    # index, so each call scans the full collection (fine at ~400k documents)
    cursor = collection.find({'text': {'$regex': candidate_name, '$options': 'i'}},
                             {"created_at": 1})
    candidateFrame['created_at'] = pd.to_datetime(pd.DataFrame(list(cursor)).created_at)
    candidateFrame.set_index('created_at', drop=False, inplace=True)
    candidateFrame.index = candidateFrame.index.tz_localize('GMT').tz_convert('EST')
    candidateFrame.index = candidateFrame.index - DateOffset(hours=12)
    per_minute = candidateFrame['created_at'].resample('1t', how='count')
    return per_minute

candidate_per_minute_data = dict()
candidates = ['Trump', 'Rubio', 'Kasich', 'Carson', 'Cruz']
for name in candidates:
    candidate_per_minute_data[name] = get_resampled_data(name.lower())
    print "Finished " + name


Finished Trump
Finished Rubio
Finished Kasich
Finished Carson
Finished Cruz

In [7]:
fig = plt.figure(figsize=(8,6))
plt.xticks(rotation=30)

#Visualize
plt.title("Tweets per Candidate")
plt.ylabel("Tweets per Minute")
for name in candidates:
    per_minute = candidate_per_minute_data[name]
    plt.plot(per_minute.index.to_pydatetime(), per_minute)

plt.legend(candidates, loc='upper right', ncol=3)
plt.xlabel("Time")
plt.savefig('tweets_per_candidate.png', dpi=180)


Carson Analytics


In [8]:
#Visualize
fig = plt.figure(figsize=(8,6))

plt.title("Ben Carson Tweets during 2/25 GOP Debate")
plt.ylabel("Tweets per Minute")
plt.plot(candidate_per_minute_data['Carson'].index.to_pydatetime(),candidate_per_minute_data['Carson'])
plt.xticks(rotation=30)

plt.xlabel("Time")
plt.annotate('"... fruit salad of life ..."',xy=("2016-02-25 9:33:00-05:00",580),
             xytext=("2016-02-25 9:32:00-05:00",750),horizontalalignment='center',
             arrowprops=dict(facecolor='black', shrink=0.05, frac=.2))
plt.annotate('"Can somebody attack me, please?"',xy=("2016-02-25 10:31:00-05:00",1290),
             xytext=("2016-02-25 10:20:00-05:00",1320),horizontalalignment='right',
             arrowprops=dict(facecolor='black', shrink=0.05, frac=.2))
plt.annotate('"... the abyss of destruction ..."',xy=("2016-02-25 8:43:00-05:00",980),
             xytext=("2016-02-25 8:45:00-05:00",1100),horizontalalignment='left',
             arrowprops=dict(facecolor='black', shrink=0.05, frac=.2))
plt.savefig('carson_tweets.png',dpi=180)


Stacked Graph of Candidate Tweets


In [9]:
fig = plt.figure(figsize=(8,6))

plt.title("Tweets per Candidate")
plt.ylabel("Tweets per Minute")
plt.xlabel("Time")

fullDataFrame = pd.DataFrame()
plt.xticks(rotation=30)

for name in candidates:
    fullDataFrame[name] = candidate_per_minute_data[name]
polys = plt.stackplot(fullDataFrame.index, fullDataFrame.Trump, fullDataFrame.Rubio,
                      fullDataFrame.Cruz, fullDataFrame.Kasich, fullDataFrame.Carson)

# stackplot returns PolyCollections, which legend() can't label directly in
# this matplotlib version, so build proxy Rectangle artists in matching colors
legendProxies = []
for poly in polys:
    legendProxies.append(plt.Rectangle((0, 0), 1, 1, fc=poly.get_facecolor()[0]))

plt.legend(legendProxies, ['Trump', 'Rubio', 'Cruz', 'Kasich', 'Carson'], ncol=3)

plt.savefig('stacked_tweets_over_time.png',dpi=180)



Tweet Totals


In [10]:
totalsFrame = pd.DataFrame()
totalsFrame['names'] = candidates
totalsFrame['Total Tweets'] = [candidate_per_minute_data[name].sum() for name in candidates]
totalsFrame = totalsFrame.sort_values(by='Total Tweets',ascending=False)

totalsFrame.plot(x='names',kind='bar',color=['b','g','r','purple','lightblue'],figsize=(7,4))
plt.xlabel('Candidate')
plt.ylabel('Number of Tweets')
plt.title('Total Tweets over Debate')
plt.xticks(rotation=0)
plt.savefig('total_tweets.png',dpi=300)


Coordinate Data

I tried to get matplotlib's Basemap toolkit to work, but I couldn't get it to cooperate with Jupyter. Given that only ~400 of the ~400k tweets carried coordinates, mapping them seemed rather pointless anyway.


In [11]:
coordinates_df = pd.DataFrame(list(collection.find({'coordinates': {'$exists': True, '$ne': None}})))
lats, longs = list(), list()
for _dict in coordinates_df['coordinates']:
    if 'coordinates' in _dict:
        # Twitter stores GeoJSON points as [longitude, latitude]
        longs.append(_dict['coordinates'][0])
        lats.append(_dict['coordinates'][1])
median_lat = np.median(lats)
median_lon = np.median(longs)
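
For anyone who does want a picture without Basemap, a plain matplotlib scatter of the geotagged points works as a rough fallback. A minimal sketch using the lats and longs built above:

In [ ]:
# Rough fallback map: scatter the geotagged tweets on plain lon/lat axes
plt.figure(figsize=(6,4))
plt.scatter(longs, lats, s=10, alpha=0.5)
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('Geotagged Tweets')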

Hashtag Analysis


In [59]:
from nltk.corpus import stopwords
import string

# Stopword list assembled for term filtering (not needed for the hashtag tally below)
punctuation = list(string.punctuation)
stop = stopwords.words('english') + punctuation + ['rt', 'via']

def get_hashtags(text):
    """Return the #-prefixed terms in a tweet's text."""
    terms = text.split()
    return [i for i in terms if i.startswith("#")]

# Tally hashtag frequencies over the first 10,000 tweets
hashtag_dict = dict()
i = 0
for tweet in collection.find():
    i += 1
    for tag in get_hashtags(tweet['text']):
        tag = tag.encode('ascii', 'ignore')
        if tag in hashtag_dict:
            hashtag_dict[tag] += 1
        else:
            hashtag_dict[tag] = 1
    if i >= 10000:
        break

hashtag_df = pd.DataFrame(columns=['name', 'freq'])
tags = hashtag_dict.keys()
hashtag_df['name'] = tags
hashtag_df['freq'] = [hashtag_dict[i] for i in tags]   # look counts up in the dict, not the frame
print hashtag_df.describe()
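
With the counts in place, listing the most common tags is one sort away. For example:

In [ ]:
# Ten most frequent hashtags in the sample
print hashtag_df.sort_values(by='freq', ascending=False)[:10]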


